# networks
import networkx as nx
import igraph as ig
# data processing
import pandas as pd
import numpy as np
import random
# some helper functions to make our lives easier
import sys
sys.path.append("../")
from common_functions import *
# viz
#import pylab as plt
import matplotlib.pyplot as plt
import seaborn as sns
import statistics
from scipy.sparse import diags, csr_matrix, linalg
from sklearn.cluster import KMeans
import time
import directedlouvain as dl
from collections import defaultdict, Counter
# Path to file
file_path = 'soc-Epinions1.txt'
# Initialize a directed graph
G = nx.DiGraph()
# Load the edge list into the directed graph
with open(file_path, 'r') as f:
# Skip header lines that start with '#'
edges = [line.strip().split('\t') for line in f if not line.startswith('#')]
# Add edges to the graph
G.add_edges_from((int(src), int(dst)) for src, dst in edges)
def graph_stats(G):
    """Print descriptive statistics of a graph.

    Reports: directedness, node/edge totals, size of the largest weakly and
    strongly connected components (for an undirected graph both reduce to
    the largest connected component), average clustering coefficient,
    number of triangles (for a directed graph: directed 3-cycles
    u -> v -> w -> u), diameter and degree assortativity.

    Parameters
    ----------
    G : networkx.Graph or networkx.DiGraph
    """
    is_directed = G.is_directed()
    num_nodes = G.number_of_nodes()
    num_edges = G.number_of_edges()
    if is_directed:
        # Largest Weakly Connected Component (WCC)
        largest_wcc = max(nx.weakly_connected_components(G), key=len)
        # Largest Strongly Connected Component (SCC)
        largest_scc = max(nx.strongly_connected_components(G), key=len)
    else:
        # weakly/strongly_connected_components raise on undirected graphs;
        # for them WCC and SCC coincide with the connected components.
        largest_wcc = max(nx.connected_components(G), key=len)
        largest_scc = largest_wcc
    G_wcc = G.subgraph(largest_wcc).copy()
    wcc_num_nodes = G_wcc.number_of_nodes()
    wcc_num_edges = G_wcc.number_of_edges()
    G_scc = G.subgraph(largest_scc).copy()
    scc_num_nodes = G_scc.number_of_nodes()
    scc_num_edges = G_scc.number_of_edges()
    avg_clustering_coeff = nx.average_clustering(G)
    if is_directed:
        # Count directed triangles (3-cycles); each cycle is found once per
        # starting node, hence the division by 3.
        num_triangles = 0
        for u in G:
            for v in G.successors(u):
                for w in G.successors(v):
                    if G.has_edge(w, u):  # closes the cycle u -> v -> w -> u
                        num_triangles += 1
        num_triangles = num_triangles // 3
        # Fix: the diameter must be computed on the largest SCC. Shortest
        # paths are only guaranteed between strongly connected nodes, so a
        # directed WCC is generally not strongly connected and nx.diameter
        # raised, yielding the previous "inf" result.
        try:
            diameter = nx.diameter(G_scc)
        except nx.NetworkXError:
            diameter = float('inf')  # If no diameter can be computed
        assortativity = nx.degree_assortativity_coefficient(G_wcc)
    else:
        # Undirected graph
        num_triangles = sum(nx.triangles(G).values()) // 3
        diameter = nx.diameter(G)
        assortativity = nx.degree_assortativity_coefficient(G)
    # Print statistics
    print("Is Directed:", is_directed)
    print("Total Nodes:", num_nodes)
    print("Total Edges:", num_edges)
    print("Nodes in largest WCC:", wcc_num_nodes)
    print("Edges in largest WCC:", wcc_num_edges)
    print("Nodes in largest SCC:", scc_num_nodes)
    print("Edges in largest SCC:", scc_num_edges)
    print("Average clustering coefficient:", avg_clustering_coeff)
    print("Number of triangles:", num_triangles)
    print("Diameter (longest shortest path):", diameter)
    # Label typo fixed ("Assortatativity" -> "Assortativity")
    print("Assortativity Coefficient:", assortativity)
graph_stats(G)
Is Directed: True Total Nodes: 75879 Total Edges: 508837 Nodes in largest WCC: 75877 Edges in largest WCC: 508836 Nodes in largest SCC: 32223 Edges in largest SCC: 443506 Average clustering coefficient: 0.11017387558244757 Number of triangles: 740310 Diameter (longest shortest path): inf Assortatativity Coefficient: 0.06389680395789947
def compute_degree(graph):
    """Tabulate per-node degrees and plot the degree distributions.

    Prints the mean and median total degree, shows log-log scatter plots of
    the out- and in-degree distributions, and returns the raw degree table.

    Parameters
    ----------
    graph : networkx.DiGraph

    Returns
    -------
    dict
        Parallel lists under the keys "Node", "In-Degree" and "Out-Degree".
    """
    degree_data = {
        "Node": [],
        "In-Degree": [],
        "Out-Degree": []
    }
    degrees = []
    for node in graph.nodes:
        in_degree = graph.in_degree(node)
        out_degree = graph.out_degree(node)
        degree_data["Node"].append(node)
        degree_data["In-Degree"].append(in_degree)
        degree_data["Out-Degree"].append(out_degree)
        degrees.append(in_degree + out_degree)
    print('Mean degree:', statistics.mean(degrees))
    print('Median degree:', statistics.median(degrees))
    del degrees  # free the temporary list before plotting
    # Distributions are derived from the lists collected above; previously
    # an adjacency dict plus a full O(E) edge list were rebuilt only to
    # recompute the very same degrees.
    out_degree_distribution = Counter(degree_data["Out-Degree"])
    # Nodes with in-degree 0 never appeared in the old edge-list tally, so
    # they are excluded here to keep the distribution identical.
    in_degree_distribution = Counter(d for d in degree_data["In-Degree"] if d > 0)
    # Plotting
    fig, axes = plt.subplots(2, 1, figsize=(8, 8))
    panels = (
        (axes[0], "Out", out_degree_distribution),
        (axes[1], "In", in_degree_distribution),
    )
    for ax, direction, distribution in panels:
        ax.scatter(distribution.keys(), distribution.values(), color="black", s=10)
        ax.set_xscale("log")
        ax.set_yscale("log")
        ax.set_title(f"Log-Log {direction}-Degree Distribution")
        ax.set_xlabel("Degree (log scale)")
        ax.set_ylabel("Number of Nodes (log scale)")
        ax.grid(True, which="both", linestyle="--", linewidth=0.5, alpha=0.7)
    plt.tight_layout()
    plt.show()
    return degree_data
degree_data = compute_degree(G)
Mean degree: 13.411800366372777 Median degree: 2
# Convert results to a pandas DataFrame for easier analysis and viewing.
# Fix: pd.DataFrame() was previously handed the string
# f'degree_data/{degree_data}', which is not valid DataFrame input and
# raises ValueError; the dict of columns must be passed directly.
degree_df = pd.DataFrame(degree_data)
degree_df.head()  # Display the first few rows
# Save the DataFrame to a CSV file
degree_df.to_csv('graph_degrees.csv', index=False)
def find_hubs(degree_data, top_n=5, threshold=600):
    """
    Identify hub nodes from a degree table.

    Args:
        degree_data (dict): Parallel lists under "Node", "In-Degree" and
            "Out-Degree".
        top_n (int): How many of the highest-degree nodes to report.
        threshold (int): Both degrees must exceed this value for a node to
            appear in the combined list.

    Returns:
        dict: {"top_in_degree": [...], "top_out_degree": [...],
               "top_in_out_degree": [...]}, where every entry is a
               (node, in_degree, out_degree) tuple.
    """
    records = list(zip(degree_data["Node"],
                       degree_data["In-Degree"],
                       degree_data["Out-Degree"]))
    # Rank once per direction, descending
    ranked_by_in = sorted(records, key=lambda rec: rec[1], reverse=True)
    ranked_by_out = sorted(records, key=lambda rec: rec[2], reverse=True)
    # Nodes strong in both directions
    both_above = [rec for rec in records
                  if rec[1] > threshold and rec[2] > threshold]
    return {
        "top_in_degree": ranked_by_in[:top_n],
        "top_out_degree": ranked_by_out[:top_n],
        "top_in_out_degree": both_above,
    }
hubs = find_hubs(degree_data, 20, threshold=600)
# Results can also be used programmatically
for label, key in (("Top by In-Degree:", "top_in_degree"),
                   ("Top by Out-Degree:", "top_out_degree"),
                   ("Nodes above threshold:", "top_in_out_degree")):
    print(label, hubs[key])
Top by In-Degree: [(18, 3035, 44), (143, 1521, 171), (737, 1317, 372), (790, 1284, 102), (136, 1180, 111)] Top by Out-Degree: [(645, 408, 1801), (763, 293, 1669), (634, 378, 1621), (71399, 116, 1128), (3924, 92, 976)] Nodes above threshold: [(44, 672, 843)]
def analize_hubs(G, node_a, node_b):
    """Report how two hub nodes relate: direct edge (or, failing that, the
    shortest path between them) and their common predecessors."""
    # Check if the hubs are connected
    if G.has_edge(node_a, node_b):
        print(f'The hubs {node_a} and {node_b} are directly connected.')
    else:
        print(f'The hubs {node_a} and {node_b} are not directly connected.')
        # If not directly connected, find the shortest path (if it exists)
        try:
            path = nx.shortest_path(G, node_a, node_b)
            print(f'The shortest path between {node_a} and {node_b} is: {path}')
        except nx.NetworkXNoPath:
            print(f'There is no path between {node_a} and {node_b}.')
    # Nodes whose outgoing edges reach both targets
    common_predecessors = set(G.predecessors(node_a)) & set(G.predecessors(node_b))
    # Output the result
    print(f'There are {len(common_predecessors)} nodes with outgoing edges to both {node_a} and {node_b}: {common_predecessors}')
analize_hubs(G,18,143)
The hubs 18 and 143 are not directly connected.
The shortest path between 18 and 143 is: [18, 128, 143]
There are 588 nodes with outgoing edges to both 18 and 143: {4100, 2055, 2057, 4110, 6159, 16, 10259, 2069, 27, 31, 16416, 33, 36, 2084, 39, 2088, 2097, 2103, 58, 62, 65, 67, 4165, 4167, 6215, 77, 79, 82, 83, 85, 4182, 10330, 91, 4187, 4188, 8288, 2144, 10338, 99, 12388, 4197, 14438, 4199, 2152, 4200, 4202, 107, 2157, 112, 18544, 119, 2167, 71801, 8316, 124, 126, 128, 8321, 10378, 140, 141, 2189, 6285, 145, 146, 8344, 2201, 172, 10413, 2226, 37043, 2231, 2232, 18620, 198, 4295, 12496, 212, 215, 217, 4313, 2270, 223, 225, 2277, 6374, 8423, 232, 2279, 30953, 6378, 4333, 4334, 239, 2287, 6382, 8434, 6383, 6393, 6397, 6402, 2308, 14600, 4368, 6419, 2329, 290, 291, 4394, 8491, 24877, 8495, 4406, 318, 320, 2369, 4418, 329, 340, 342, 343, 6487, 348, 8541, 356, 2410, 375, 379, 45439, 384, 385, 57728, 2433, 4482, 6527, 8592, 6555, 418, 8613, 2470, 2471, 431, 4527, 4537, 443, 445, 447, 14783, 4547, 20933, 2504, 460, 12750, 4564, 2525, 8671, 485, 6629, 2538, 8684, 2548, 2549, 4596, 2553, 508, 4606, 6656, 518, 12807, 8712, 12809, 12810, 8716, 2572, 2574, 12821, 8732, 547, 549, 552, 559, 6703, 18997, 2614, 570, 2623, 576, 4679, 16968, 586, 4685, 592, 2644, 2646, 10838, 29272, 6744, 602, 2651, 2653, 606, 607, 2656, 10846, 4706, 624, 625, 629, 630, 4725, 634, 6780, 637, 4736, 6784, 644, 645, 649, 8842, 12939, 6793, 653, 2704, 2706, 661, 663, 2713, 670, 671, 6814, 8867, 6820, 8871, 6824, 2729, 6829, 686, 2736, 697, 10937, 2750, 708, 6852, 733, 2786, 4834, 2788, 748, 749, 2796, 751, 759, 762, 4862, 775, 2826, 8974, 21262, 8976, 788, 789, 790, 8989, 805, 808, 13104, 6960, 4914, 6962, 821, 824, 2872, 15161, 828, 831, 4929, 21320, 2889, 4941, 4944, 4949, 29529, 862, 4962, 11114, 2931, 898, 19330, 908, 909, 9100, 19343, 7059, 2970, 925, 5027, 2980, 2992, 2995, 955, 960, 5056, 3011, 5060, 965, 970, 7114, 19406, 988, 3039, 997, 13289, 1006, 15350, 1017, 1025, 13325, 1059, 5164, 3117, 1075, 1077, 5175, 1084, 5181, 3138, 3142, 9306, 3174, 19561, 7273, 5229, 7289, 19583, 5248, 5263, 13455, 
5266, 1171, 1173, 11413, 3226, 3234, 17581, 1209, 23738, 5309, 3262, 3268, 5318, 1224, 1225, 3277, 3278, 1234, 1238, 11479, 15575, 7394, 1252, 1254, 3306, 7408, 17649, 7418, 11521, 1283, 7429, 7430, 1288, 7435, 5393, 13588, 3352, 5412, 11560, 11568, 5425, 3379, 5427, 5428, 3386, 19790, 25939, 3412, 3418, 11616, 3425, 1379, 3431, 9582, 1404, 7549, 5503, 5505, 5510, 25993, 5514, 5518, 1423, 3474, 1428, 1434, 5534, 1440, 3488, 1443, 1445, 19885, 3502, 1455, 3507, 1460, 3514, 3515, 1469, 1471, 1473, 1474, 3523, 1480, 1483, 5580, 1491, 1492, 7637, 1494, 7643, 1501, 1505, 3557, 1510, 3564, 1517, 1518, 11756, 1520, 1522, 24052, 11765, 1526, 1528, 1529, 1532, 3582, 3584, 3586, 1539, 1541, 9738, 3594, 17933, 1551, 1552, 9746, 9748, 1562, 1564, 9762, 5667, 5679, 15941, 3673, 1626, 11868, 1629, 5724, 3679, 1634, 3683, 1636, 1647, 1650, 1657, 20090, 1661, 1663, 1665, 1669, 26247, 7817, 5776, 1683, 1684, 5782, 1690, 1692, 1694, 5792, 22180, 1701, 1702, 16036, 63147, 1716, 26292, 1718, 5812, 1720, 5813, 3774, 1727, 7872, 3780, 5831, 5832, 7881, 1738, 1748, 7896, 1754, 7900, 1767, 71400, 71403, 71404, 1778, 3827, 3828, 3830, 7935, 7949, 3854, 20241, 10002, 5908, 5916, 22300, 5918, 5917, 1831, 3879, 3880, 16169, 1835, 1836, 3887, 5939, 5941, 1846, 1847, 1849, 3898, 5951, 5952, 1857, 1859, 1860, 1863, 1865, 1866, 1867, 5967, 3921, 3923, 1876, 3925, 1885, 8032, 5992, 3952, 6001, 1912, 16250, 8060, 1918, 3968, 3985, 4001, 4003, 6053, 6055, 4010, 4017, 4038, 4040, 4045, 16334, 4051, 8149, 8158, 6114, 20452, 4080, 2033, 2037, 2043, 8188}
# Degree centrality
degree_centrality = nx.degree_centrality(G)
ranking = sorted(degree_centrality.items(), key=lambda kv: kv[1], reverse=True)
print("Top 20 nodes by degree centrality:", ranking[:20])
Top 20 nodes by degree centrality: [(18, 0.04057829673950289), (645, 0.029112522733862254), (634, 0.026344922111811067), (763, 0.02585729724030681), (143, 0.022298953583383855), (737, 0.022259416431640266), (44, 0.019966261630512138), (790, 0.018266164105537837), (34, 0.01730409341311052), (136, 0.017014154300324207), (1179, 0.01648699227707636), (71399, 0.01639473892300799), (27, 0.01618387411370885), (1719, 0.01563035398929861), (1516, 0.014931864308495217), (118, 0.01485279000500804), (1, 0.01478689475210206), (637, 0.014523313740478137), (3924, 0.014075226020717467), (40, 0.013903898363161919)]
def get_core_nodes(graph, threshold):
    """
    Iteratively strips nodes whose total degree (in + out) falls below
    ``threshold`` and returns the remaining "core" of the network.

    Parameters:
    - graph: A NetworkX DiGraph (directed graph)
    - threshold: minimum in-degree + out-degree a node must keep

    Returns:
    - A NetworkX DiGraph representing the core of the network
    """
    core_graph = graph.copy()  # never mutate the caller's graph
    iteration = 0
    total_removed = 0
    while True:
        # Nodes currently below the degree threshold
        low_degree = [
            n for n in core_graph.nodes
            if core_graph.in_degree(n) + core_graph.out_degree(n) < threshold
        ]
        if not low_degree:
            break  # fixed point reached: every remaining node qualifies
        core_graph.remove_nodes_from(low_degree)
        total_removed += len(low_degree)
        print(f'Iteration {iteration}: removed {len(low_degree)} nodes')
        iteration += 1
    print(f'Executed {iteration} iterations, removed {total_removed} nodes')
    return core_graph
# Get the core graph by pruning nodes of total degree < 4
threshold = 4
core_G = get_core_nodes(G, threshold)
# Compare sizes before and after pruning
print(f"Original Graph: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")
print(f"Core Graph: {core_G.number_of_nodes()} nodes, {core_G.number_of_edges()} edges")
# Persist the core graph for later reuse
nx.write_edgelist(core_G, "core_g_edge_list.txt", data=False)
Iteration 0: removed 49243 nodes Iteration 1: removed 2772 nodes Iteration 2: removed 309 nodes Iteration 3: removed 44 nodes Iteration 4: removed 8 nodes Executed 5 iterations, removed 52376 nodes Original Graph: 75879 nodes, 508837 edges Core Graph: 23503 nodes, 431564 edges
graph_stats(core_G)
Is Directed: True Total Nodes: 433 Total Edges: 34213 Nodes in largest WCC: 433 Edges in largest WCC: 34213 Nodes in largest SCC: 432 Edges in largest SCC: 34096 Average clustering coefficient: 0.30159020719278284 Number of triangles: 246277 Diameter (longest shortest path): 3
plot_network(core_G, with_labels=False)
The core graph contains 30.97% of the nodes of the original graph (23,503 of 75,879). Let's calculate the mean and median degree.
core_degree_data = compute_degree(core_G)
Mean degree: 36.72416287282475 Median degree: 10
The mean and median are higher than those of the original graph, as expected after pruning low-degree nodes.
# Convert results to a pandas DataFrame for easier analysis and viewing.
# Fix: pd.DataFrame() was previously handed the string
# f'degree_data/{core_degree_data}', which is not valid DataFrame input
# and raises ValueError; the dict of columns must be passed directly.
degree_df = pd.DataFrame(core_degree_data)
degree_df.head()  # Display the first few rows
# Save the DataFrame to a CSV file
degree_df.to_csv('core_node_degrees.csv', index=False)
core_hubs = find_hubs(core_degree_data, 20, threshold=600)
# Results can also be used programmatically
for label, key in (("Top by In-Degree:", "top_in_degree"),
                   ("Top by Out-Degree:", "top_out_degree"),
                   ("Nodes above threshold:", "top_in_out_degree")):
    print(label, core_hubs[key])
Top by In-Degree: [(18, 2761, 44), (143, 1438, 143), (737, 1199, 370), (790, 1193, 102), (1179, 1119, 85)] Top by Out-Degree: [(645, 390, 1632), (634, 351, 1350), (763, 251, 1053), (71399, 109, 932), (3924, 92, 901)] Nodes above threshold: [(44, 627, 808)]
analize_hubs(core_G, 18, 143)
The hubs 18 and 143 are not directly connected.
The shortest path between 18 and 143 is: [18, 128, 143]
There are 588 nodes with outgoing edges to both 18 and 143: {4100, 2055, 2057, 4110, 6159, 16, 10259, 2069, 27, 31, 16416, 33, 36, 2084, 39, 2088, 2097, 2103, 58, 62, 65, 67, 4165, 4167, 6215, 77, 79, 82, 83, 85, 4182, 10330, 91, 4187, 4188, 8288, 2144, 10338, 99, 12388, 4197, 14438, 4199, 2152, 4200, 4202, 107, 2157, 112, 18544, 119, 2167, 71801, 8316, 124, 126, 128, 8321, 10378, 140, 141, 2189, 6285, 145, 146, 8344, 2201, 172, 10413, 2226, 37043, 2231, 2232, 18620, 198, 4295, 12496, 212, 215, 217, 4313, 2270, 223, 225, 2277, 6374, 8423, 232, 2279, 30953, 6378, 4333, 4334, 239, 2287, 6382, 8434, 6383, 6393, 6397, 6402, 2308, 14600, 4368, 6419, 2329, 290, 291, 4394, 8491, 24877, 8495, 4406, 318, 320, 2369, 4418, 329, 340, 342, 343, 6487, 348, 8541, 356, 2410, 375, 379, 45439, 384, 385, 57728, 2433, 4482, 6527, 8592, 6555, 418, 8613, 2470, 2471, 431, 4527, 4537, 443, 445, 447, 14783, 4547, 20933, 2504, 460, 12750, 4564, 2525, 8671, 485, 6629, 2538, 8684, 2548, 2549, 4596, 2553, 508, 4606, 6656, 518, 12807, 8712, 12809, 12810, 8716, 2572, 2574, 12821, 8732, 547, 549, 552, 559, 6703, 18997, 2614, 570, 2623, 576, 4679, 16968, 586, 4685, 592, 2644, 2646, 10838, 29272, 6744, 602, 2651, 2653, 606, 607, 10846, 2656, 4706, 624, 625, 629, 630, 4725, 634, 6780, 637, 4736, 6784, 644, 645, 649, 8842, 12939, 6793, 653, 2704, 2706, 661, 663, 2713, 670, 671, 6814, 8867, 6820, 8871, 6824, 2729, 6829, 686, 2736, 697, 10937, 2750, 708, 6852, 733, 2786, 4834, 2788, 748, 749, 2796, 751, 759, 762, 4862, 775, 2826, 8974, 21262, 8976, 788, 789, 790, 8989, 805, 808, 13104, 6960, 4914, 6962, 821, 824, 2872, 15161, 828, 831, 4929, 21320, 2889, 4941, 4944, 4949, 29529, 862, 4962, 11114, 2931, 898, 19330, 9100, 909, 908, 19343, 7059, 2970, 925, 5027, 2980, 2992, 2995, 955, 960, 5056, 3011, 5060, 965, 970, 7114, 19406, 988, 3039, 997, 13289, 1006, 15350, 1017, 1025, 13325, 1059, 5164, 3117, 1075, 1077, 5175, 1084, 5181, 3138, 3142, 9306, 3174, 19561, 7273, 5229, 7289, 19583, 5248, 5263, 13455, 
5266, 1171, 1173, 11413, 3226, 3234, 17581, 1209, 23738, 5309, 3262, 3268, 5318, 1224, 1225, 3277, 3278, 1234, 1238, 11479, 15575, 7394, 1252, 1254, 3306, 7408, 17649, 7418, 11521, 1283, 7429, 7430, 1288, 7435, 5393, 13588, 3352, 5412, 11560, 11568, 5425, 3379, 5427, 5428, 3386, 19790, 25939, 3412, 3418, 11616, 3425, 1379, 3431, 9582, 1404, 7549, 5503, 5505, 5510, 25993, 5514, 5518, 1423, 3474, 1428, 1434, 5534, 1440, 3488, 1443, 1445, 19885, 3502, 1455, 3507, 1460, 3514, 3515, 1469, 1471, 1473, 1474, 3523, 1480, 1483, 5580, 1491, 1492, 7637, 1494, 7643, 1501, 1505, 3557, 1510, 3564, 1517, 1518, 11756, 1520, 1522, 24052, 11765, 1526, 1528, 1529, 1532, 3582, 3584, 3586, 1539, 1541, 9738, 3594, 17933, 1551, 1552, 9746, 9748, 1562, 1564, 9762, 5667, 5679, 15941, 3673, 1626, 11868, 1629, 5724, 3679, 1634, 3683, 1636, 1647, 1650, 1657, 20090, 1661, 1663, 1665, 1669, 26247, 7817, 5776, 1683, 1684, 5782, 1690, 1692, 1694, 5792, 22180, 1701, 1702, 16036, 63147, 1716, 26292, 1718, 5812, 1720, 5813, 3774, 1727, 7872, 3780, 5831, 5832, 7881, 1738, 1748, 7896, 1754, 7900, 1767, 71400, 71403, 71404, 1778, 3827, 3828, 3830, 7935, 7949, 3854, 20241, 10002, 5908, 5916, 5917, 5918, 22300, 1831, 3879, 3880, 16169, 1835, 1836, 3887, 5939, 5941, 1846, 1847, 1849, 3898, 5951, 5952, 1857, 1859, 1860, 1863, 1865, 1866, 1867, 5967, 3921, 3923, 1876, 3925, 1885, 8032, 5992, 3952, 6001, 1912, 16250, 8060, 1918, 3968, 3985, 4001, 4003, 6053, 6055, 4010, 4017, 4038, 4040, 4045, 16334, 4051, 8149, 8158, 6114, 20452, 4080, 2033, 2037, 2043, 8188}
# Degree centrality of the core graph
degree_centrality = nx.degree_centrality(core_G)
ranking = sorted(degree_centrality.items(), key=lambda kv: kv[1], reverse=True)
print("Top 20 nodes by degree centrality:", ranking[:20])
Top 20 nodes by degree centrality: [(18, 0.11935154454940004), (645, 0.08603523104416645), (634, 0.07237681899412815), (143, 0.0672708705642073), (737, 0.0667602757212152), (44, 0.06105863330780359), (763, 0.05548463960513999), (34, 0.05514424304314526), (790, 0.055101693472895924), (136, 0.05135733129095396), (1179, 0.05122968258020594), (27, 0.05008084418347374), (1516, 0.047315122117266614), (1719, 0.04676197770402519), (1, 0.04612373415028508), (118, 0.04480469747255553), (71399, 0.04429410262956344), (40, 0.04412390434856608), (4416, 0.04271976853033784), (31, 0.04259211981958982)]
reversed_core_G = core_G.reverse()
Let's compute descriptive statistics for the reversed core graph.
graph_stats(reversed_core_G)
Is Directed: True Total Nodes: 23503 Total Edges: 431564 Nodes in largest WCC: 23497 Edges in largest WCC: 431552 Nodes in largest SCC: 21498 Edges in largest SCC: 418634 Average clustering coefficient: 0.21951381653187882 Number of triangles: 738999 Diameter (longest shortest path): 14
plot_network(reversed_core_G, with_labels=False)
reverse_core_degree_data = compute_degree(reversed_core_G)
Mean degree: 36.72416287282475 Median degree: 10
As expected, these are the same values as for the core graph: reversing edge directions leaves each node's total (in + out) degree unchanged.
# Convert results to a pandas DataFrame for easier analysis and viewing.
# Fix: pd.DataFrame() was previously handed the string
# f'degree_data/{reverse_core_degree_data}', which is not valid DataFrame
# input and raises ValueError; the dict of columns must be passed directly.
degree_df = pd.DataFrame(reverse_core_degree_data)
degree_df.head()  # Display the first few rows
# Save the DataFrame to a CSV file
degree_df.to_csv('reverse_core_node_degrees.csv', index=False)
Let's check the top nodes by in-degree and the top nodes by out-degree.
reverse_hubs = find_hubs(reverse_core_degree_data, 20, threshold=600)
# Results can also be used programmatically
for label, key in (("Top by In-Degree:", "top_in_degree"),
                   ("Top by Out-Degree:", "top_out_degree"),
                   ("Nodes above threshold:", "top_in_out_degree")):
    print(label, reverse_hubs[key])
Top by In-Degree: [(645, 1632, 390), (634, 1350, 351), (763, 1053, 251), (71399, 932, 109), (3924, 901, 92), (44, 808, 627), (637, 741, 254), (1059, 711, 69), (145, 614, 71), (824, 607, 183), (5232, 593, 76), (1669, 535, 29), (1225, 505, 92), (770, 499, 79), (34, 493, 803), (661, 489, 124), (3850, 483, 65), (629, 466, 136), (1596, 466, 193), (2704, 441, 132)] Top by Out-Degree: [(18, 44, 2761), (143, 143, 1438), (737, 370, 1199), (790, 102, 1193), (1179, 85, 1119), (136, 111, 1096), (1719, 46, 1053), (118, 123, 930), (4416, 106, 898), (780, 3, 889), (27, 334, 843), (128, 98, 822), (34, 493, 803), (1516, 311, 801), (40, 238, 799), (1, 319, 765), (28, 139, 761), (1619, 123, 720), (791, 36, 716), (401, 200, 695)] Nodes above threshold: [(44, 808, 627)]
The top 5 nodes by in-degree of the core graph now are the top 5 nodes by out-degree of the reversed graph! And vice-versa.
analize_hubs(reversed_core_G, 18, 143)
The hubs 18 and 143 are directly connected.
There are 1 nodes with outgoing edges to both 18 and 143: {735}
That seems odd at first: why were these two hubs not directly connected in the core graph, yet directly connected in the reversed graph? Because reversing flips every edge: the core graph contains the edge 143 → 18, which becomes 18 → 143 in the reversed graph.
# Degree centrality of the reversed core graph (reversing edges leaves each
# node's total in+out degree unchanged, so this matches the core graph)
degree_centrality = nx.degree_centrality(reversed_core_G)
ranking = sorted(degree_centrality.items(), key=lambda kv: kv[1], reverse=True)
print("Top 20 nodes by degree centrality:", ranking[:20])
Top 20 nodes by degree centrality: [(18, 0.11935154454940004), (645, 0.08603523104416645), (634, 0.07237681899412815), (143, 0.0672708705642073), (737, 0.0667602757212152), (44, 0.06105863330780359), (763, 0.05548463960513999), (34, 0.05514424304314526), (790, 0.055101693472895924), (136, 0.05135733129095396), (1179, 0.05122968258020594), (27, 0.05008084418347374), (1516, 0.047315122117266614), (1719, 0.04676197770402519), (1, 0.04612373415028508), (118, 0.04480469747255553), (71399, 0.04429410262956344), (40, 0.04412390434856608), (4416, 0.04271976853033784), (31, 0.04259211981958982)]
This function extracts a subgraph around a node. It is interesting to see a subgraph around the node with the highest in-degree, i.e. node 18.
# Extract a subgraph around a node (e.g., ego graph).
# Fix: the reversed-graph plots referenced an undefined name
# `reversed_graph` (NameError); the reversed graph built earlier is
# `reversed_core_G`. The twelve copy-pasted plotting cells are also
# factored into one helper.
def _plot_ego(graph, node, graph_label=None, radius=1):
    """Draw the radius-limited ego graph of `node` within `graph`.

    graph_label, when given, is appended to the plot title to identify
    which graph variant is shown.
    """
    subgraph = nx.ego_graph(graph, node, radius=radius)
    plt.figure(figsize=(10, 10))
    nx.draw(subgraph, with_labels=True, node_size=50, font_size=8)
    if graph_label is None:
        plt.title(f"Ego Graph of Node {node}")
    else:
        plt.title(f"Ego Graph of Node {node} with radius={radius} - {graph_label}")
    plt.show()

# Two-hop neighbourhood of node 0 in the original graph
_plot_ego(G, 0, radius=2)
# One-hop neighbourhoods of the main hubs in each graph variant
for hub in (18, 143, 645, 634):
    _plot_ego(G, hub, "Original Graph")
    _plot_ego(core_G, hub, "Core Graph")
    _plot_ego(reversed_core_G, hub, "Reversed Graph")
# Highlight a single edge of interest on a spring layout of the core graph
edge_to_highlight = (18, 143)
pos = nx.spring_layout(core_G)
# Draw only the highlighted edge, in red
nx.draw_networkx_edges(core_G, pos, edgelist=[edge_to_highlight], edge_color="red", width=2)
# Draw the node labels
nx.draw_networkx_labels(core_G, pos, font_size=12, font_color="black")
# Show the plot
plt.title("Arco evidenziato: " + str(edge_to_highlight))
plt.show()
## Adjacency Heatmap
import numpy as np
import seaborn as sns
# Build the dense adjacency matrix of the core graph
A = nx.to_numpy_array(core_G)
# Heatmap of the 500x500 top-left corner only (full matrix is too large)
subset = A[:500, :500]
sns.heatmap(subset, cmap='viridis')
plt.title("Adjacency Matrix Heatmap (Subset)")
plt.show()
nx.draw_networkx(G, node_size=1000, node_color='lightblue', with_labels=True, font_size=8)
class Simulation:
    """Synchronous graph-simulation driver.

    Behaviour is injected through three callables: ``initial_state_func(G)``
    builds the starting node->state mapping, ``state_transition_func(G,
    state)`` advances the whole mapping by one step, and
    ``stop_condition_func(G, current, previous)`` signals termination.
    """

    def __init__(self, G, initial_state_func, state_transition_func, stop_condition_func, max_steps=100, name=''):
        self.G = G
        self.initial_state_func = initial_state_func
        self.state_transition_func = state_transition_func
        self.stop_condition_func = stop_condition_func
        self.max_steps = max_steps
        self.name = name
        self.current_state = self.initial_state_func(G)
        self.states = [self.current_state]  # history of state mappings
        self.step_count = 0
        self.state_changes = []  # number of nodes that flip state per step

    def run(self):
        """Run the simulation until the stop condition is met or max steps."""
        while self.step_count < self.max_steps:
            previous = self.current_state.copy()
            self.current_state = self.state_transition_func(self.G, self.current_state)
            # Tally how many nodes changed state during this step
            self.state_changes.append(
                sum(self.current_state[node] != previous[node] for node in self.G)
            )
            self.states.append(self.current_state)
            self.step_count += 1
            if self.stop_condition_func(self.G, self.current_state, previous):
                print(f"Simulation stopped at step {self.step_count}.")
                break

    def plot(self):
        """Plot the proportions of node states over time."""
        tallies = [Counter(snapshot.values()) for snapshot in self.states]
        xs = range(len(tallies))
        for state_value in {'awake', 'asleep'}:
            fractions = [t.get(state_value, 0) / len(self.G) for t in tallies]
            plt.plot(xs, fractions, label=state_value)
        plt.title(f'{self.name} State Proportions Over Time')
        plt.xlabel('Simulation Step')
        plt.ylabel('Proportion of Nodes')
        plt.legend()
        plt.grid()
        plt.show()

    def plot_state_changes(self):
        """Plot the number of state changes at each step."""
        xs = range(len(self.state_changes))
        plt.plot(xs, self.state_changes, label='State Changes')
        plt.title(f'{self.name} State Changes Per Step')
        plt.xlabel('Simulation Step')
        plt.ylabel('Number of Nodes Changing State')
        plt.grid()
        plt.legend()
        plt.show()
# initial state function
def initial_state_hubs_awake(G, top_k_hubs=10):
    """Mark the `top_k_hubs` highest-degree nodes 'awake', all others 'asleep'."""
    state = dict.fromkeys(G.nodes, 'asleep')
    # Rank all nodes by total degree, descending, and wake the first k
    ranked = sorted(G.degree, key=lambda pair: pair[1], reverse=True)
    for node, _degree in ranked[:top_k_hubs]:
        state[node] = 'awake'
    return state
# Define the synchronous state transition function
P_AWAKEN = 0.2  # Probability that a sleeping node wakes spontaneously
def state_transition(G, current_state):
    """Synchronous update: a sleeping node wakes if any neighbour is awake,
    or spontaneously with probability P_AWAKEN; awake nodes stay awake."""
    next_state = current_state.copy()
    sleeping = (n for n in G.nodes if current_state[n] == 'asleep')
    for node in sleeping:
        has_awake_neighbor = any(
            current_state[other] == 'awake' for other in G.neighbors(node)
        )
        # Short-circuit keeps the RNG draw for nodes without awake neighbours
        if has_awake_neighbor or random.random() < P_AWAKEN:
            next_state[node] = 'awake'
    return next_state
# Define the stop condition
def stop_condition(G, current_state, previous_state):
    """Stop once the state mapping is unchanged between consecutive steps."""
    return previous_state == current_state
# Synchronous simulation on the reversed core graph
sim_sync = Simulation(
    reversed_core_G,
    # The driver passes its own graph to this callback (it IS
    # reversed_core_G), so the argument can be used directly.
    initial_state_func=lambda g: initial_state_hubs_awake(g, top_k_hubs=10),
    state_transition_func=state_transition,
    stop_condition_func=stop_condition,
    max_steps=100,
    name='Synchronous Simulation'
)
sim_sync.run()
sim_sync.plot_state_changes()
sim_sync.plot()
Simulation stopped at step 24.
Random partitions (with 3 and 9 blocks) of both the full graph and the core graph will now be created.
# Fix: each "partition" below was actually built with only TWO blocks (a
# random sample plus its complement), although the comments and printed
# messages claim 3, 9 or 27 blocks. Build genuine k-way random partitions.
def _random_node_partition(graph, k):
    """Split the nodes of `graph` into k random, disjoint, covering blocks."""
    nodes = list(graph.nodes)
    random.shuffle(nodes)
    # Round-robin slicing yields blocks whose sizes differ by at most 1
    return [set(nodes[i::k]) for i in range(k)]

# 3 partitions from original graph (75879 nodes)
random_partition = _random_node_partition(G, 3)
if nx.community.is_partition(G, random_partition):
    print("3 random partition of the full graph were obtained, each one with 25293 nodes.")
# 9 partitions from original graph
random_partition_2 = _random_node_partition(G, 9)
if nx.community.is_partition(G, random_partition_2):
    print("9 random partition of the full graph were obtained, each one with 8431.")
# 3 partitions from core graph (23503 nodes)
random_partition_3 = _random_node_partition(core_G, 3)
if nx.community.is_partition(core_G, random_partition_3):
    print("3 random partition of the core graph were obtained, around 7834 nodes in each partitions.")
# 9 partitions from core graph
random_partition_4 = _random_node_partition(core_G, 9)
if nx.community.is_partition(core_G, random_partition_4):
    print("9 random partition of the core graph were obtained, around 2611 nodes in each partitions.")
# 27 partitions from core graph
random_partition_5 = _random_node_partition(core_G, 27)
if nx.community.is_partition(core_G, random_partition_5):
    print("27 random partition of the core graph were obtained, around 870 nodes in each partitions.")
3 random partition of the full graph were obtained, each one with 25293 nodes. 9 random partition of the full graph were obtained, each one with 8431. 3 random partition of the core graph were obtained, around 7834 nodes in each partitions. 9 random partition of the core graph were obtained, around 2611 nodes in each partitions. 27 random partition of the core graph were obtained, around 870 nodes in each partitions.
# Build a node -> cluster-index lookup so we can check whether two nodes
# share a partition.
def make_partition_map(partition):
    """Flatten a partition (iterable of node collections) into a dict
    mapping each node to the index of the cluster containing it."""
    return {
        node: cluster_idx
        for cluster_idx, members in enumerate(partition)
        for node in members
    }
# Flatten every random partition into a node -> cluster lookup.
pmap = make_partition_map(random_partition)
pmap2 = make_partition_map(random_partition_2)
pmap3 = make_partition_map(random_partition_3)
pmap4 = make_partition_map(random_partition_4)
pmap5 = make_partition_map(random_partition_5)

# Check whether two nodes of interest (18 and 143) landed in the same part.
# (These are random partitions, so the outcome is not particularly meaningful.)
for mapping in (pmap, pmap2, pmap3, pmap4, pmap5):
    print(mapping[18] == mapping[143])
False True False True True
The modularity of a graph partition compares the number of intra-group edges with a random baseline. Higher modularity scores correspond to a higher proportion of intra-group edges, therefore fewer inter-group edges and better separation of groups.
$$ Q_w=\frac{1}{W}\sum_C \left(W_C-\frac{s_C^2}{4W}\right) $$ where $W$ is the total edge weight of the graph, $W_C$ is the total weight of the edges inside community $C$, and $s_C$ is the sum of the strengths (weighted degrees) of the nodes in $C$.
# Modularity of every random partition (full graph first, then core graph);
# random partitions should score close to zero.
for graph, partition in (
    (G, random_partition),
    (G, random_partition_2),
    (core_G, random_partition_3),
    (core_G, random_partition_4),
    (core_G, random_partition_5),
):
    print("Modularity %7.4f" % nx.community.quality.modularity(graph, partition))
Modularity -0.0003 Modularity -0.0010 Modularity -0.0011 Modularity -0.0001 Modularity -0.0001
Since this is a random process the modularity won't be exactly zero, but it should be fairly close.
Next, the adjacency and Laplacian matrices will be built from the set of nodes.
def compute_graph_matrices(core_graph):
    """
    Build the adjacency, degree, and Laplacian matrices of a directed graph.

    Parameters:
    - core_graph: a NetworkX DiGraph (the core graph); only `.nodes` and
      `.edges` are accessed.

    Returns:
    - adjacency_matrix: dense NumPy array (n x n), A[i, j] = 1 for edge i -> j
    - degree_matrix: sparse diagonal matrix of out-degrees (scipy.sparse)
    - laplacian_matrix: sparse matrix D - A (scipy.sparse)
    - core_nodes: sorted node labels giving the row/column ordering
    """
    # Sort node labels so matrix rows/columns have a deterministic order.
    core_nodes = sorted(core_graph.nodes)
    index_of = {label: i for i, label in enumerate(core_nodes)}

    n = len(core_nodes)
    adjacency_matrix = np.zeros((n, n))
    for src, dst in core_graph.edges:
        # Directed edge src -> dst.
        adjacency_matrix[index_of[src], index_of[dst]] = 1

    # Out-degree of each node = row sum of the adjacency matrix.
    out_degrees = adjacency_matrix.sum(axis=1)
    degree_matrix = diags(out_degrees, offsets=0, format="csr")

    # Unnormalized Laplacian L = D - A.
    laplacian_matrix = degree_matrix - csr_matrix(adjacency_matrix)

    return adjacency_matrix, degree_matrix, laplacian_matrix, core_nodes
# Build the matrices for the core graph; matrix rows/columns follow the
# sorted order returned in core_nodes.
adjacency_matrix, degree_matrix, laplacian_matrix, core_nodes = compute_graph_matrices(core_G)
# Sanity check: the Laplacian is square with one row/column per core node.
print(laplacian_matrix.shape)
(23503, 23503)
And now eigenvalues and eigenvectors of Laplacian matrix
The eigenvalues and eigenvectors are already computed and stored in the .npz file.
# Number of eigenvalues/vectors to compute: the full spectrum.
k = core_G.number_of_nodes()

start_time = time.time()
# BUG FIX: ARPACK's eigsh requires k < n, so requesting k == n raises an
# error. Since the Laplacian was being densified anyway (.toarray()), use
# the dense symmetric solver, which returns *all* eigenpairs and already
# sorts the eigenvalues in ascending order.
# NOTE(review): the Laplacian of a directed graph is generally not
# symmetric; eigh (like the original eigsh) assumes symmetry and only uses
# one triangle of the matrix — confirm this is the intended spectrum.
eigenvalues, eigenvectors = np.linalg.eigh(laplacian_matrix.toarray())
elapsed_time = time.time() - start_time
print(elapsed_time)

# Output results
# print("Eigenvalues:", eigenvalues)
print("Eigenvectors (shape):", eigenvectors.shape)

# Downcast to float32 to halve the size of the saved file.
eigenvectors = eigenvectors.astype(np.float32)
eigenvalues = eigenvalues.astype(np.float32)

# eigh already returns eigenvalues in ascending order; re-sort defensively
# so downstream code can rely on the ordering regardless of solver.
sorted_indices = np.argsort(eigenvalues)
eigenvalues = eigenvalues[sorted_indices]
eigenvectors = eigenvectors[:, sorted_indices]

# Cache the expensive decomposition on disk.
np.savez_compressed("eigenvalues_eigenvectors.npz", eigenvalues=eigenvalues, eigenvectors=eigenvectors)
print("Eigenvalues and eigenvectors have been saved to 'eigenvalues_eigenvectors.npz'")
Restore eigenvalues and eigenvectors
# Reload the cached spectrum. Use a context manager so the .npz file handle
# is closed once the arrays are extracted (the original left it open).
with np.load("eigenvalues_eigenvectors.npz") as data:
    eigenvalues = data['eigenvalues'].astype(np.float32)
    eigenvectors = data['eigenvectors'].astype(np.float32)
Eigenvalues plots
# Scree-style plot of the sorted Laplacian spectrum.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(np.arange(len(eigenvalues)), eigenvalues, marker='o', linestyle='-')
ax.set_title('Sorted Eigenvalues of the Laplacian Matrix')
ax.set_xlabel('Index')
ax.set_ylabel('Eigenvalue')
ax.grid()
plt.show()
# The Fiedler vector (eigenvector of the 2nd smallest eigenvalue, index 1)
# is the classic bisection signal in spectral partitioning.
fiedler_vector = eigenvectors[:, 1]
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(fiedler_vector, marker='o', linestyle='-')
ax.set_title('Fiedler Vector')
ax.set_xlabel('Node Index')
ax.set_ylabel('Value')
ax.grid()
plt.show()
# Large gaps between consecutive eigenvalues suggest a natural cluster count.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(np.arange(1, len(eigenvalues)), np.diff(eigenvalues), marker='o', linestyle='-')
ax.set_title("Spectral Gap Analysis")
ax.set_xlabel("Eigenvalue Index")
ax.set_ylabel("Difference Between Consecutive Eigenvalues")
ax.grid()
plt.show()
# Spectral Clustering: k-means on the first `num_clusters` eigenvectors.
num_clusters = 5  # chosen by looking at the big gaps between consecutive eigenvalues
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
clusters = kmeans.fit_predict(eigenvectors[:, :num_clusters])

# Save cluster assignments as a CSV file. The rows of `eigenvectors` follow
# the order of `core_nodes`, so label each row with its original node id
# directly (the original built an index column and then remapped it with an
# extra per-row apply — same result, one step).
cluster_df = pd.DataFrame({
    "Node": core_nodes,   # original node labels, in matrix row order
    "Cluster": clusters   # assigned cluster for each node
})
cluster_df.to_csv(f'kmeans/{num_clusters}_cluster_assignments.csv', index=False)
print(f'Cluster assignments saved to kmeans/{num_clusters}_cluster_assignments.csv')
Cluster assignments saved to kmeans/5_cluster_assignments.csv
# Reload the saved assignments so the analysis below can run stand-alone.
cluster_df = pd.read_csv(f'kmeans/{num_clusters}_cluster_assignments.csv')
def analyze_clusters(graph, cluster_assignments):
    """
    Analyze per-cluster properties: size, density, and a modularity proxy.

    Parameters:
    - graph: a NetworkX graph containing (at least) the assigned nodes.
    - cluster_assignments: DataFrame with 'Node' and 'Cluster' columns.

    Returns:
    - cluster_analysis: dict mapping cluster id -> {'num_nodes',
      'num_edges', 'density', 'modularity_contribution'}.

    NOTE: 'modularity_contribution' compares the cluster's edge fraction
    with the squared *node* fraction — a simplification of the standard
    degree-based null model.
    """
    # Group node labels by their assigned cluster.
    # (The original also built an unused node->cluster dict here; removed.)
    cluster_groups = cluster_assignments.groupby('Cluster')['Node'].apply(list)

    cluster_analysis = {}
    total_edges = graph.number_of_edges()

    # Analyze each cluster via its induced subgraph.
    for cluster_id, nodes in cluster_groups.items():
        subgraph = graph.subgraph(nodes)
        num_nodes = subgraph.number_of_nodes()
        num_edges = subgraph.number_of_edges()
        density = nx.density(subgraph)

        # Fraction of all edges inside the cluster, minus the expected
        # fraction under the simplified node-count null model.
        cluster_edge_fraction = num_edges / total_edges
        modularity_contribution = cluster_edge_fraction - (len(nodes) / graph.number_of_nodes())**2

        cluster_analysis[cluster_id] = {
            'num_nodes': num_nodes,
            'num_edges': num_edges,
            'density': density,
            'modularity_contribution': modularity_contribution
        }

    return cluster_analysis
# Analyze clusters using the `cluster_df` DataFrame
cluster_properties = analyze_clusters(core_G, cluster_df)

# Display the per-cluster statistics.
for cluster_id, properties in cluster_properties.items():
    print(f"Cluster {cluster_id}:")
    print(f" Nodes: {properties['num_nodes']}")
    print(f" Edges: {properties['num_edges']}")
    print(f" Density: {properties['density']:.4f}")
    print(f" Modularity Contribution: {properties['modularity_contribution']:.4f}")
    print()

# Extract cluster sizes for visualization.
# (Only the values are needed here, so iterate .values() rather than
# unpacking .items() with an unused key.)
cluster_sizes = [properties['num_nodes'] for properties in cluster_properties.values()]

plt.figure(figsize=(10, 6))
plt.bar(range(len(cluster_sizes)), cluster_sizes)
plt.title("Cluster Size Distribution")
plt.xlabel("Cluster ID")
plt.ylabel("Number of Nodes")
plt.xticks(range(len(cluster_sizes)), labels=cluster_properties.keys())  # cluster IDs as x-axis labels
plt.grid()
plt.show()
Cluster 0: Nodes: 1584 Edges: 15378 Density: 0.0061 Modularity Contribution: 0.0311 Cluster 1: Nodes: 17364 Edges: 89135 Density: 0.0003 Modularity Contribution: -0.3393 Cluster 2: Nodes: 1093 Edges: 1405 Density: 0.0012 Modularity Contribution: 0.0011 Cluster 3: Nodes: 1 Edges: 0 Density: 0.0000 Modularity Contribution: -0.0000 Cluster 4: Nodes: 3461 Edges: 156360 Density: 0.0131 Modularity Contribution: 0.3406
# Deterministic (seeded) force-directed layout for reproducible plots.
layout = nx.spring_layout(core_G, seed=42)

# Map clusters to their corresponding nodes
cluster_mapping = dict(zip(cluster_df['Node'], cluster_df['Cluster']))

# Extract positions and cluster colors in core_nodes order.
node_positions = {node: layout[node] for node in core_nodes}
node_colors = [cluster_mapping[node] for node in core_nodes]

# Plot the graph with nodes colored by their cluster.
plt.figure(figsize=(12, 8))
nx.draw_networkx_edges(core_G, pos=node_positions, alpha=0.5, edge_color="gray")
nx.draw_networkx_nodes(
    core_G,
    pos=node_positions,
    nodelist=core_nodes,
    node_color=node_colors,
    # BUG FIX: plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed
    # in 3.9; plt.get_cmap is the supported accessor.
    cmap=plt.get_cmap("tab10"),
    node_size=20
)
plt.title("Graph Clusters Visualization")
plt.axis("off")  # hide axes for a cleaner network plot
plt.show()
# Run Directed Louvain on the exported core-graph edge list.
dl_obj = dl.Community("core_g_edge_list.txt", precision=0.0001, gamma=1, reproducibility=False, renumbering=True, random=False)

import contextlib

# Capture the hierarchical output in a file. Using `with` guarantees the
# handle is closed even if dl_obj.run raises (the original closed manually).
with open("core_g_hierarchical", "w") as output:
    with contextlib.redirect_stdout(output):
        dl_obj.run(verbose=True)
level 0: network size: 75879 nodes, 508841 arcs, 584717 weight. modularity increased from -0.0171296 to 0.372264 level 1: network size: 8148 nodes, 34988 arcs, 584717 weight. modularity increased from 0.372264 to 0.521522 level 2: network size: 2785 nodes, 11884 arcs, 584717 weight. modularity increased from 0.521522 to 0.523204 level 3: network size: 2102 nodes, 7390 arcs, 584717 weight. modularity increased from 0.523204 to 0.523442 level 4: network size: 1972 nodes, 6110 arcs, 584717 weight. modularity increased from 0.523442 to 0.523455 level 5: network size: 1959 nodes, 5933 arcs, 584717 weight. modularity increased from 0.523455 to 0.523456 level 6: network size: 1958 nodes, 5917 arcs, 584717 weight. modularity increased from 0.523456 to 0.523456
This did not work from Python due to a problem with the shared-object file, which caused a segmentation fault. The algorithm was therefore run with the C++ implementation, which produced the results, so we only need to import the output file.
# output=open("nodecomm.txt", "w")
# with contextlib.redirect_stdout(output):
# dl_obj.print_level(-2)
# output.close()
def com_histogram(filename, graphname):
    """Plot a bar chart of community sizes read from a whitespace-separated
    `node community` assignment file (as produced by Directed Louvain)."""
    data = pd.read_csv(filename, sep=r'\s+', header=None, names=["node", "community"])

    # Number of nodes assigned to each community.
    community_counts = data["community"].value_counts()

    plt.figure(figsize=(10, 6))
    plt.bar(community_counts.index, community_counts.values, color='skyblue', edgecolor='black')
    plt.xlabel("Community")
    plt.ylabel("Frequency")
    plt.title(f"Number of Nodes per Community in {graphname}")
    plt.xticks(community_counts.index)
    plt.show()
# Community-size histograms for both node->community assignment files.
# Get the histogram for the full dataset
com_histogram("full_to_comm.txt", "Full dataset")
# Get the histogram for the core dataset
com_histogram("core_to_comm.txt", "Subset")
def get_info(filename, graphname):
    """Print the number of communities and the ten largest ones found in a
    whitespace-separated `node community` assignment file."""
    data = pd.read_csv(filename, sep=r'\s+', header=None, names=["node", "community"])

    # Sizes of every community, largest first.
    community_sizes = data["community"].value_counts()

    print(f"Total number of communities in {graphname}: {data['community'].nunique()}")

    # Present the ten biggest communities as a small table.
    top_communities_df = community_sizes.head(10).reset_index()
    top_communities_df.columns = ["Community", "Node Count"]
    print(f"\nTop 10 communities in {graphname} and their sizes:")
    print(top_communities_df)
# Summarize community counts and the largest communities per graph.
# Get the information about the communities for full dataset:
get_info("full_to_comm.txt", "full dataset")
# Get the information about the communities for subset:
get_info("core_to_comm.txt", "subset")
Total number of communities in full dataset: 1151 Top 10 communities in full dataset and their sizes: Community Node Count 0 146 22180 1 3 15652 2 194 6873 3 82 2648 4 42 2453 5 1 2451 6 521 1462 7 377 1187 8 433 1176 9 490 1154 Total number of communities in subset: 172 Top 10 communities in subset and their sizes: Community Node Count 0 16 6952 1 1 4506 2 33 3824 3 5 2331 4 125 2269 5 27 621 6 89 351 7 52 251 8 13 251 9 8 200
def get_comm(filename, graphname, node_list):
    """Print the community assignment rows for the nodes in `node_list`,
    read from a whitespace-separated `node community` file."""
    data = pd.read_csv(filename, sep=r'\s+', header=None, names=["node", "community"])

    # Keep only the rows belonging to the requested nodes.
    selected = data[data["node"].isin(node_list)]

    print(f"Communities for specific nodes in {graphname}:")
    print(selected)
# Look up the community membership of the hub nodes identified earlier.
# Get communities for top in degree nodes for full dataset:
get_comm("full_to_comm.txt", "full dataset", top_indegree_nodes_full)
# Get communities for top in degree nodes for subset:
get_comm("core_to_comm.txt", "subset", top_indegree_nodes_core)
# Get communities for top out degree nodes for full dataset:
get_comm("full_to_comm.txt", "full dataset", top_outdegree_nodes_full)
# Get communities for top out degree nodes for subset:
get_comm("core_to_comm.txt", "subset", top_outdegree_nodes_core)
# compute the intra and inter community trust
def trust_links(filename, graphname, graph):
    """Bar-plot how many edges of `graph` stay inside a single community
    versus crossing between communities, according to the whitespace-
    separated `node community` file."""
    assignments = pd.read_csv(filename, sep=r'\s+', header=None, names=["node", "community"])
    communities = dict(zip(assignments["node"], assignments["community"]))

    # Count edges by whether both endpoints share a community.
    intra = 0
    inter = 0
    for src, dst in graph.edges():
        if communities[src] == communities[dst]:
            intra += 1
        else:
            inter += 1

    labels = ['Intra-Community Trust', 'Inter-Community Trust']
    plt.bar(labels, [intra, inter], color=['skyblue', 'orange'])
    plt.title(f"Intra vs. Inter-Community Trust {graphname}")
    plt.ylabel("Number of Trust Relationships")
    plt.show()
# Compute intra- and inter-community trust for the full dataset graph G
trust_links("full_to_comm.txt", "full dataset", G)
# Compute intra- and inter-community trust for the core subset core_G
trust_links("core_to_comm.txt", "subset", core_G)